# DEBUG UNTIL CONTAINER UPDATEDchooseCRANmirror(ind =1)install.packages('matrixStats', lib=".")# ENDDEBUGlibrary(dplyr)# Ensure infix operator is available, methods should still reference dplyr namespace otherwiseoptions(dplyr.summarise.inform =FALSE)# Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.'if(is.null(params$runsheet)){stop("PARAMETERIZATION ERROR: Must supply runsheet path")}runsheet=params$runsheet# <path/to/runsheet>annotation_file_path<-as.character(params$annotation_file_path)# Annotation file from 'genelab_annots_link' column of https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csvorganism<-params$organism# Used to determine primary keytypemessage(params)
# fileEncoding removes strange characters from the column namesdf_rs<-read.csv(runsheet, check.names =FALSE, fileEncoding ='UTF-8-BOM')# NON_DPPD:STARTprint("Here is the embedded runsheet")
# NON_DPPD:ENDprint("Loading Raw Data...")# NON_DPPD
[1] "Loading Raw Data..."
Code
# TODO: priority-low generalize this utility functionallTrue<-function(i_vector){if(length(i_vector)==0){stop(paste("Input vector is length zero"))}all(i_vector)}# Define paths to raw data filesrunsheetPathsAreURIs<-function(df_runsheet){allTrue(stringr::str_starts(df_runsheet$`Array Data File Path`, "https"))}# Download raw data filesdownloadFilesFromRunsheet<-function(df_runsheet){urls<-df_runsheet$`Array Data File Path`destinationFiles<-df_runsheet$`Array Data File Name`mapply(function(url, destinationFile){print(paste0("Downloading from '", url, "' TO '", destinationFile, "'"))if(file.exists(destinationFile)){warning(paste("Using Existing File:", destinationFile))}else{download.file(url, destinationFile)}}, urls, destinationFiles)destinationFiles# Return these paths}if(runsheetPathsAreURIs(df_rs)){print("Determined Raw Data Locations are URIS")local_paths<-downloadFilesFromRunsheet(df_rs)}else{print("Or Determined Raw Data Locations are local paths")local_paths<-df_rs$`Array Data File Path`}
[1] "Determined Raw Data Locations are URIS"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775174_A_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775174_A_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775175_B_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775175_B_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775176_C_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775176_C_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775177_D_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775177_D_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775178_E_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775178_E_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775179_F_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775179_F_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775180_G_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775180_G_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775181_A_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775181_A_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775182_B_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775182_B_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775183_C_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775183_C_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775184_D_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775184_D_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775185_E_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775185_E_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775186_F_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775186_F_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775187_G_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775187_G_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775188_A_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775188_A_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775189_B_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775189_B_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775190_C_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775190_C_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775191_D_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775191_D_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775192_E_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775192_E_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775193_F_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775193_F_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775194_G_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775194_G_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775195_H_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775195_H_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775196_A_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775196_A_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775197_B_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775197_B_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775198_C_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775198_C_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775199_D_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775199_D_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775200_E_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775200_E_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775201_F_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775201_F_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775202_G_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775202_G_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775203_H_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775203_H_C14_2.txt.gz'"
Code
# Decompress files if neededif(allTrue(stringr::str_ends(local_paths, ".gz"))){print("Determined these files are gzip compressed... Decompressing now")# This does the decompressionlapply(local_paths, R.utils::gunzip, remove =FALSE, overwrite =TRUE)# This removes the .gz extension to get the decompressed filenameslocal_paths<-vapply(local_paths, stringr::str_replace, # Run this function against each item in 'local_paths' FUN.VALUE =character(1), # Execpt an character vector as a return USE.NAMES =FALSE, # Don't use the input to assign names for the returned list pattern =".gz$", # first argument for applied function replacement =""# second argument for applied function)}
[1] "Determined these files are gzip compressed... Decompressing now"
Density of raw intensities for each array. These are raw intensity values with background intensity values subtracted. A lack of overlap indicates a need for normalization.
3.2 Pseudo Image Plots
Code
agilentImagePlot<-function(eListRaw){# Adapted from this discussion: https://support.bioconductor.org/p/15523/copy_raw_data<-eListRawcopy_raw_data$genes$Block<-1# Agilent arrays only have one blocknames(copy_raw_data$genes)[2]<-"Column"copy_raw_data$printer<-limma::getLayout(copy_raw_data$genes)r<-copy_raw_data$genes$Rowc<-copy_raw_data$genes$Columnnr<-max(r)nc<-max(c)y<-rep(NA,nr*nc)i<-(r-1)*nc+cfor(array_iinseq(colnames(copy_raw_data$E))){y[i]<-log2(copy_raw_data$E[,array_i])limma::imageplot(y,copy_raw_data$printer, main =rownames(copy_raw_data$targets)[array_i])}}agilentImagePlot(raw_data)
3.3 MA Plots
Code
for(array_iinseq(colnames(raw_data$E))){message(glue::glue("MA Plot for array: {array_i} of {length(colnames(raw_data$E))}"))# NON_DPPDsample_name<-rownames(raw_data$targets)[array_i]limma::plotMA(raw_data,array=array_i,xlab="Average log-expression",ylab="Expression log-ratio (this sample vs. others)", main =sample_name, status=raw_data$genes$ControlType)}
3.4 Foreground-Background Plots
Code
for(array_iinseq(colnames(raw_data$E))){message(glue::glue("FB Plot for array: {array_i} of {length(colnames(raw_data$E))}"))# NON_DPPDsample_name<-rownames(raw_data$targets)[array_i]limma::plotFB(raw_data, array =array_i, xlab ="log2 Background", ylab ="log2 Foreground", main =sample_name)}
3.5 Boxplots
Code
boxplotExpressionSafeMargin<-function(data){# NON_DPPD:START#' plot boxplots of expression values#'#' Ensures the plot labels are vertical and fit the plot#' @param data: limma::EListRaw or limma::EList# NON_DPPD:ENDlongest_sample_name_length<-max(nchar(rownames(data$targets)))*1bottom_margin<-min(35, longest_sample_name_length)par(mar=c(bottom_margin,2,1,1))boxplot(log2(data$E), las=2)}boxplotExpressionSafeMargin(raw_data)
# Normalize background-corrected data using the quantile methodnorm_data<-limma::normalizeBetweenArrays(norm_data, method ="quantile")print("Summarized Normalized Data Below")# NON_DPPD
[1] "Summarized Normalized Data Below"
Code
print("Note: These are expected to be the same values as the normalized data since no filtering/summarization has been performed")# NON_DPPD
[1] "Note: These are expected to be the same values as the normalized data since no filtering/summarization has been performed"
Code
# Summarize background-corrected and normalized dataprint(paste0("Number of Arrays: ", dim(norm_data)[2]))
Density of norm intensities for each array. Near complete overlap is expected after normalization.
6.2 Pseudo Image Plots
Code
agilentImagePlot(norm_data)
6.3 MA Plots
Code
for(array_iinseq(colnames(norm_data$E))){sample_name<-rownames(norm_data$targets)[array_i]limma::plotMA(norm_data,array=array_i,xlab="Average log-expression",ylab="Expression log-ratio (this sample vs. others)", main =sample_name, status=norm_data$genes$ControlType)}
6.4 Boxplots
Code
boxplotExpressionSafeMargin(norm_data)
7 Perform Probeset Differential Expression
7.1 Approach Motivation
Based on bioconductor discussions with Gordon Smyth (“[Smyth’s] research group created the limma, edgeR, goseq, Rsubread, csaw and diffHic packages.”) here: https://support.bioconductor.org/p/116616/#116674
Summarization using avereps is ‘deliberately designed for cases when each probe is associated with exactly one gene’.
Given good quality gene annotations (which this analysis should have by using bioMart Ensembl Gene annotations), ‘most Agilent probes should map to a unique gene’.
Based on this discussion, the folowing approach is utilized:
Add biomart annotations first
Compute all mapping statistics (i.e. multimapping (one-probe to many-genes), redudant mapping (many-probes to one-gene))
Perform DE with and without filtering (assessing effects of filtering on row-wise results)
Don’t summarize the results from the probe level since any probe with a single gene mapping being DE implies the gene a DEG. https://support.bioconductor.org/p/93796/#116605
message(paste0("Expected dataset name: '", expected_dataset_name, "'"))# NON_DPPD# Specify Ensembl version used in current GeneLab reference annotationsENSEMBL_VERSION<-'107'print(paste0("Searching for Ensembl Version: ", ENSEMBL_VERSION))# NON_DPPD
[1] "Searching for Ensembl Version: 107"
Code
ensembl<-biomaRt::useEnsembl(biomart ="genes", dataset =expected_dataset_name, version =ENSEMBL_VERSION)
Warning in listEnsemblArchives(https = FALSE): Ensembl will soon enforce the use of https.
As such the 'https' argument will be deprecated in the next release.
Object of class 'Mart':
Using the ENSEMBL_MART_ENSEMBL BioMart database
Using the hsapiens_gene_ensembl dataset
Code
getBioMartAttribute<-function(df_rs, params){#' Returns resolved biomart attribute# NON_DPPD:START#' this either comes from the runsheet or as a fall back, the parameters injected during render#' if neither exist, an error is thrown# NON_DPPD:END# check if runsheet has Array Design REFif(!is.null(df_rs$`biomart_attribute`)){print("Using attribute name sourced from runsheet")# Format according to biomart needsformatted_value<-unique(df_rs$`biomart_attribute`)%>%stringr::str_replace_all(" ","_")%>%# Replace all spaces with underscorestringr::str_to_lower()# Lower casing onlyreturn(formatted_value)}else{print("Could not find 'Array Design REF' in runsheet, falling back to parameters")}# check if a fallback has been given via paramsif(!is.null(params$biomart_attribute)){print("Using attribute name sourced from parameters")return(params$biomart_attribute)}# finally throw an error if neither guard condition was truestop("No valid biomart attribute identified")}expected_attribute_name<-getBioMartAttribute(df_rs, params)
Warning: DEBUG MODE: Limiting query to 300 entries
Code
# DEBUG:END# Create probe map# Run Biomart Queries in chunks to prevent request timeouts# Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk sizeCHUNK_SIZE=8000probe_id_chunks<-split(probe_ids, ceiling(seq_along(probe_ids)/CHUNK_SIZE))df_mapping<-data.frame()for(iinseq_along(probe_id_chunks)){probe_id_chunk<-probe_id_chunks[[i]]print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))# NON_DPPDchunk_results<-biomaRt::getBM( attributes =c(expected_attribute_name,"ensembl_gene_id"), filters =expected_attribute_name, values =probe_id_chunk, mart =ensembl)df_mapping<-df_mapping%>%dplyr::bind_rows(chunk_results)Sys.sleep(10)# Slight break between requests to prevent back-to-back requests}
Running biomart query chunk 1 of 1. Total probes IDS in query (300)
Code
# Convert list of multi-mapped genes to stringlistToUniquePipedString<-function(str_list){#! convert lists into strings denoting unique elements separated by '|' characters#! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3"return(toString(unique(str_list))%>%stringr::str_replace_all(pattern =stringr::fixed(", "), replacement ="|"))}unique_probe_ids<-df_mapping%>%# note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPDdplyr::group_by(!!sym(expected_attribute_name))%>%dplyr::summarise( ENSEMBL =listToUniquePipedString(ensembl_gene_id))%>%# Count number of ensembl IDS mappeddplyr::mutate( count_ENSEMBL_mappings =1+stringr::str_count(ENSEMBL, stringr::fixed("|")))norm_data$genes<-norm_data$genes%>%dplyr::left_join(unique_probe_ids, by =c("ProbeName"=expected_attribute_name))%>%dplyr::mutate( count_ENSEMBL_mappings =ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings))
7.3 Summarize Biomart Mapping vs. Manufacturer Mapping
Code
# Pie Chart with Percentagesslices<-c('Control probes'=nrow(norm_data$gene%>%dplyr::filter(ControlType!=0)%>%dplyr::distinct(ProbeName)), 'Unique Mapping'=nrow(norm_data$gene%>%dplyr::filter(ControlType==0)%>%dplyr::filter(count_ENSEMBL_mappings==1)%>%dplyr::distinct(ProbeName)), 'Multi Mapping'=nrow(norm_data$gene%>%dplyr::filter(ControlType==0)%>%dplyr::filter(count_ENSEMBL_mappings>1)%>%dplyr::distinct(ProbeName)), 'No Mapping'=nrow(norm_data$gene%>%dplyr::filter(ControlType==0)%>%dplyr::filter(count_ENSEMBL_mappings==0)%>%dplyr::distinct(ProbeName)))pct<-round(slices/sum(slices)*100)chart_names<-names(slices)chart_names<-glue::glue("{names(slices)} ({slices})")# add count to labelsschart_names<-paste(chart_names, pct)# add percents to labelschart_names<-paste(chart_names,"%",sep="")# ad % to labelspie(slices,labels =chart_names, col=rainbow(length(slices)), main=glue::glue("Biomart Mapping to Ensembl Primary Keytype\n {nrow(norm_data$gene %>% dplyr::distinct(ProbeName))} Total Unique Probes"))
runsheetToDesignMatrix<-function(runsheet_path){df=read.csv(runsheet_path)# get only Factor Value columnsfactors=as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])colnames(factors)=paste("factor",1:dim(factors)[2], sep="_")# Load metadata from runsheet csv filecompare_csv=data.frame(sample_id =df[,c("Sample.Name")], factors)# Create data frame containing all samples and respective factorsstudy<-as.data.frame(compare_csv[,2:dim(compare_csv)[2]])colnames(study)<-colnames(compare_csv)[2:dim(compare_csv)[2]]rownames(study)<-compare_csv[,1]# Format groups and indicate the group that each sample belongs toif(dim(study)[2]>=2){group<-apply(study,1,paste,collapse =" & ")# concatenate multiple factors into one condition per sample}else{group<-study[,1]}group_names<-paste0("(",group,")",sep ="")# human readable group namesgroup<-sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group)))# group naming compatible with R models, this maintains the default behaviour of make.names with the exception that 'X' is never prepended to group namesnames(group) <- group_namesnames(group)<-group_names# Format contrasts table, defining pairwise comparisons for all groupscontrast.names<-combn(levels(factor(names(group))),2)# generate matrix of pairwise group combinations for comparisoncontrasts<-apply(contrast.names, MARGIN=2, function(col)sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2)))))contrast.names<-c(paste(contrast.names[1,],contrast.names[2,],sep ="v"),paste(contrast.names[2,],contrast.names[1,],sep ="v"))# format combinations for output table files namescontrasts<-cbind(contrasts,contrasts[c(2,1),])colnames(contrasts)<-contrast.namessampleTable<-data.frame(condition=factor(group))rownames(sampleTable)<-df[,c("Sample.Name")]condition<-sampleTable[,'condition']names_mapping<-as.data.frame(cbind(safe_name =as.character(condition), original_name =group_names))design<-model.matrix(~0+condition)design_data<-list( matrix =design, mapping =names_mapping, groups =as.data.frame(cbind(sample =df[,c("Sample.Name")], group =group_names)), contrasts =contrasts)return(design_data)}# Loading metadata from runsheet csv filedesign_data<-runsheetToDesignMatrix(runsheet)design<-design_data$matrix# Write SampleTable.csv and contrasts.csv filewrite.csv(design_data$groups, "SampleTable.csv")write.csv(design_data$contrasts, "contrasts.csv")
7.5 Perform Individual Probe Level DE
Code
lmFitPairwise<-function(norm_data, design){#' Perform all pairwise comparisons#' Approach based on limma manual section 17.4 (version 3.52.4)fit<-limma::lmFit(norm_data, design)# Create Contrast Modelfit.groups<-colnames(fit$design)[which(fit$assign==1)]combos<-combn(fit.groups,2)contrasts<-c(paste(combos[1,],combos[2,],sep ="-"),paste(combos[2,],combos[1,],sep ="-"))# format combinations for limma:makeContrastscont.matrix<-limma::makeContrasts(contrasts=contrasts,levels=design)contrast.fit<-limma::contrasts.fit(fit, cont.matrix)contrast.fit<-limma::eBayes(contrast.fit,trend=TRUE,robust=TRUE)return(contrast.fit)}# Calculate resultsres<-lmFitPairwise(norm_data, design)DT::datatable(limma::topTable(res))# NON_DPPD
Code
# Print DE table, without filteringlimma::write.fit(res, adjust ='BH', file ="INTERIM.csv", row.names =FALSE, quote =TRUE, sep =",")
7.6 Add Additional Columns and Format DE Table
Code
## Reformat Table for consistency across DE analyses tables within GeneLab ### Read in DE table df_interim<-read.csv("INTERIM.csv")# Reformat column namesreformat_names<-function(colname, group_name_mapping){# NON_DPPD:START#! Converts from:#! "P.value.adj.conditionWild.Type...Space.Flight...1st.generation.conditionWild.Type...Ground.Control...4th.generation"#! to something like:#! "Adj.p.value(Wild Type & Space Flight & 1st generation)v(Wild Type & Ground Control & 4th generation)"#! Since two groups are expected to be replace, ensure replacements happen in pairs# Remove 'condition' from group names## This was introduced while creating design matrix# Rename other columns for consistency across genomics related DE outputs# NON_DPPD:ENDnew_colname<-colname%>%stringr::str_replace(pattern ="^P.value.adj.condition", replacement ="Adj.p.value_")%>%stringr::str_replace(pattern ="^P.value.condition", replacement ="P.value_")%>%stringr::str_replace(pattern ="^Coef.condition", replacement ="Log2fc_")%>%# This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.htmlstringr::str_replace(pattern ="^t.condition", replacement ="T.stat_")%>%stringr::str_replace(pattern =stringr::fixed("Genes.ProbeName"), replacement ="PROBEID")%>%stringr::str_replace(pattern =stringr::fixed("Genes.SYMBOL"), replacement ="SYMBOL")%>%stringr::str_replace(pattern =stringr::fixed("Genes.ENSEMBL"), replacement ="ENSEMBL")%>%stringr::str_replace(pattern =stringr::fixed("Genes.GOSLIM_IDS"), replacement ="GOSLIM_IDS")%>%stringr::str_replace(pattern =".condition", replacement ="v")# remap to group names before make.names was appliedfor(iinseq(nrow(group_name_mapping))){safe_name<-group_name_mapping[i,]$safe_nameoriginal_name<-group_name_mapping[i,]$original_namenew_colname<-new_colname%>%stringr::str_replace(pattern =stringr::fixed(safe_name), replacement =original_name)}return(new_colname)}df_interim<-df_interim%>%dplyr::rename_with(reformat_names, group_name_mapping =design_data$mapping)# Concatenate expression values for each sampledf_interim<-df_interim%>%dplyr::bind_cols(norm_data$E)## Add Group Wise Statistics ### Group mean and standard deviations for normalized expression values are computed and added to the tableunique_groups<-unique(design_data$group$group)for(iinseq_along(unique_groups)){current_group<-unique_groups[i]current_samples<-design_data$group%>%dplyr::group_by(group)%>%dplyr::summarize( samples =sort(unique(sample)))%>%dplyr::filter(group==current_group)%>%dplyr::pull()print(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))print(glue::glue("Group: {current_group}"))print(glue::glue("Samples in Group: '{toString(current_samples)}'"))# NON_DPPD:STARTmessage(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))message(glue::glue("Group: {current_group}"))message(glue::glue("Samples in Group: '{toString(current_samples)}'"))# NON_DPPD:ENDdf_interim<-df_interim%>%dplyr::mutate("Group.Mean_{current_group}":=rowMeans(select(., all_of(current_samples))),"Group.Stdev_{current_group}":=matrixStats::rowSds(as.matrix(select(., all_of(current_samples)))),)%>%dplyr::ungroup()%>%as.data.frame()}
Computing mean and standard deviation for Group 1 of 2
Group: (pre-confinement)
Samples in Group: 'GSM2775174, GSM2775175, GSM2775176, GSM2775177, GSM2775178, GSM2775179, GSM2775180, GSM2775188, GSM2775189, GSM2775190, GSM2775191, GSM2775192, GSM2775193, GSM2775194, GSM2775195'
Computing mean and standard deviation for Group 2 of 2
Group: (post-confinement for 14 days)
Samples in Group: 'GSM2775181, GSM2775182, GSM2775183, GSM2775184, GSM2775185, GSM2775186, GSM2775187, GSM2775196, GSM2775197, GSM2775198, GSM2775199, GSM2775200, GSM2775201, GSM2775202, GSM2775203'
Code
## Compute all sample mean and standard deviationprint(glue::glue("Computing mean and standard deviation for all samples"))
Computing mean and standard deviation for all samples
# These columns are data mapped to column PROBEID as per the original Manufacturer and can be linked as neededcolnames_to_remove=c("Genes.Row","Genes.Col","Genes.Start","Genes.Sequence","Genes.ControlType","Genes.ProbeName","Genes.GeneName","Genes.SystematicName","Genes.Description","AveExpr"# Replaced by 'All.mean' column# "Genes.count_ENSEMBL_mappings", Keep this)df_interim<-df_interim%>%dplyr::select(-any_of(colnames_to_remove))## Concatenate annotations for genes (for uniquely mapped probes) ##### Read in annotation table for the appropriate organism ###annot<-read.table(annotation_file_path, sep ="\t", header =TRUE, quote ="", comment.char ="",)# Join annotation table and uniquely mapped data# Determine appropriate keytypemap_primary_keytypes<-c('Caenorhabditis elegans'='ENSEMBL','Danio rerio'='ENSEMBL','Drosophila melanogaster'='ENSEMBL','Rattus norvegicus'='ENSEMBL','Saccharomyces cerevisiae'='ENSEMBL','Homo sapiens'='ENSEMBL','Mus sapiens'='ENSEMBL','Arabidopsis thaliana '='TAIR')df_interim<-merge(annot,df_interim, by =map_primary_keytypes[[organism]],# ensure all original dge rows are kept.# If unmatched in the annotation database, then fill missing with NAN all.y =TRUE)# Save to filewrite.csv(df_interim, "differential_expression.csv", row.names =FALSE)### Add columns needed to generate GeneLab visualization plots## Add column to indicate the sign (positive/negative) of log2fc for each pairwise comparisondf<-df_interimupdown_table<-sign(df[,grep("Log2fc_",colnames(df))])colnames(updown_table)<-gsub("Log2fc","Updown",grep("Log2fc_",colnames(df),value =TRUE))df<-cbind(df,updown_table)rm(updown_table)## Add column to indicate contrast significance with p <= 0.1sig.1_table<-df[,grep("P.value_",colnames(df))]<=.1colnames(sig.1_table)<-gsub("P.value","Sig.1",grep("P.value_",colnames(df),value =TRUE))df<-cbind(df,sig.1_table)rm(sig.1_table)## Add column to indicate contrast significance with p <= 0.05sig.05_table<-df[,grep("P.value_",colnames(df))]<=.05colnames(sig.05_table)<-gsub("P.value","Sig.05",grep("P.value_",colnames(df),value =TRUE))df<-cbind(df, sig.05_table)rm(sig.05_table)## Add columns for the volcano plot with p-value and adjusted p-valuelog_pval_table<-log2(df[,grep("P.value_", colnames(df))])colnames(log_pval_table)<-paste0("Log2_", colnames(log_pval_table))df<-cbind(df, log_pval_table)rm(log_pval_table)log_adj_pval_table<-log2(df[,grep("Adj.p.value_", colnames(df))])colnames(log_adj_pval_table)<-paste0("Log2_", colnames(log_adj_pval_table))df<-cbind(df, log_adj_pval_table)rm(log_adj_pval_table)write.csv(df, row.names =FALSE,"visualization_output_table.csv")### Generate and export PCA table for GeneLab visualization plots## Only use positive expression values, negative values can make up a small portion ( < 0.5% ) of normalized expression values and cannot be log transformedexp_raw<-log2(norm_data$E)# negatives get converted to NA
## Log same info into versions.txt fileversion_output_fn<-"versions.txt"cat(capture.output(sessionInfo()),"BioC_version_associated_with_R_version",toString(tools:::.BioC_version_associated_with_R_version()), file =version_output_fn, append =TRUE, sep ="\n")